""" %pip install pandas
%pip install numpy
%pip install ipywidgets
%pip install matplotlib
%pip install seaborn
%pip install sklearn
%pip install pyarrow
# for web scraping
%pip install beautifuklsoup4
# for web scraping
%pip install requests
# for pdf scraping
%pip install PyPDF2
%pip install wordcloud
# not sure if this is needed
%pip install pandas-profiling
%pip install pywedge
# for sentiment analysis
%pip install vaderSentiment """
' %pip install pandas \n%pip install numpy \n%pip install ipywidgets \n%pip install matplotlib \n%pip install seaborn \n%pip install sklearn \n%pip install pyarrow \n# for web scraping\n%pip install beautifuklsoup4 \n# for web scraping \n%pip install requests\n # for pdf scraping\n%pip install PyPDF2 \n%pip install wordcloud \n\n# not sure if this is needed\n%pip install pandas-profiling \n%pip install pywedge\n# for sentiment analysis\n%pip install vaderSentiment '
%matplotlib inline
from urllib import request as rq
import pandas as pd
import os
import pyarrow as pa # this is needed for the parquet file
import numpy as np
import ipywidgets
from ipywidgets import widgets
from ipywidgets import interact, interactive, fixed, VBox
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import validation_curve
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from statsmodels.tools.eval_measures import mse, rmse
from bs4 import BeautifulSoup # for web scraping
import requests # for web scraping
import PyPDF2 # for pdf scraping
from wordcloud import WordCloud # for word cloud
from wordcloud import STOPWORDS # for word cloud
from wordcloud import ImageColorGenerator
import re # for regular expression
import tkinter as tk # for GUI
from PIL import Image, ImageTk # for GUI
import csv # for csv file writing
from pandas_profiling import ProfileReport # this is of limited use for this project
import pywedge as pw # this is a new package for data analysis
## sentiment analysis ref: https://www.geeksforgeeks.org/python-sentiment-analysis-using-vader/?ref=lbp
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from numpy import arange # to be used in lasso
from scipy import stats
from scipy.stats import levene
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Begin -- Functions to handle the reading of different internet file types
## Function to run the various search page on the IFA site
def searchIFA(searchTerm, pageNumber, header):
    """Fetch one page of IFA search results and return it as parsed HTML."""
    base = 'https://www.ifa.ie/'
    # page 1 lives at the site root; later pages sit under /page/<n>/
    page_part = '' if pageNumber == 1 else 'page/' + str(pageNumber) + '/'
    response = requests.get(base + page_part + '?s=' + searchTerm, headers=header)
    return BeautifulSoup(response.content)
## Function to find the links in the search results
def load_links(soup):
    """Map a sanitised article title to its URL for every hit on a results page."""
    # results are wrapped in divs carrying this (very specific) bootstrap class string
    hits = soup.find_all("div", {"class": "col-sm-12 offset-md-2 col-md-10 col-lg-8 offset-lg-2 col-lg-8"})
    links = {}
    for hit in hits:
        # the <h3> text becomes the destination file name; '/' would break paths
        title = hit.find('h3').get_text().replace('/', '_')
        links[title] = hit.find('a').get('href')
    return links
# Function to save the html pages as text files
# Function to save the html pages as text files
def resultText(url, destFilename, header): # save the html pages as text files
    """Download an IFA article and save its main content as a text file.

    The article's year (last 4 characters of its entry date) is appended to
    the file name.  Pages without a date element are skipped with a message.
    """
    r = requests.get(url, headers=header)
    soup = BeautifulSoup(r.content)
    pageContent = soup.find_all("div", {"class": "single-content"})
    # (the unused pageTitle lookup was removed; it crashed on pages without an <h1>)
    try:
        pageDate = soup.find('time', {"class": "entry-date"}).get_text()
    except AttributeError:  # soup.find returned None -> page carries no date
        print('No date found')
        return
    yr = pageDate[-4:]  # dates end with the 4-digit year
    destFilename = destFilename + '_' + yr
    # concatenate all content blocks; the old code re-opened the file in 'w'
    # mode for every block (later blocks overwrote earlier ones) and never
    # closed the handle
    text = ''.join(pageDate + ' ' + element.get_text() for element in pageContent)
    if pageContent:
        with open(destFilename + '.txt', 'w', encoding="utf-8") as txtFile:
            txtFile.writelines(text)
def saveResults(soup):
    """Walk every page of the IFA 'silage' search results and save each article.

    NOTE(review): relies on the notebook-global header dict ``h`` being
    defined before this is called -- confirm against the calling cell.
    """
    # find the number of pages returned by the search: the pagination links
    # list page numbers followed by a 'Next' link, so the last number seen
    # before 'Next' is the highest page number
    nav_results = soup.find_all("div", {"class": "nav-links"})
    nav_results = nav_results[0].find_all('a')
    for element in nav_results:
        if element.getText() == 'Next':
            break
        max_page = element.getText()
        max_page = int(max_page)
    destDir = './Data/Raw/IFA/'
    # loop through the pages and extract the links
    for page in range(1, max_page + 1):
        #print(page)
        soup = searchIFA(searchTerm = 'silage', pageNumber = page, header = h)
        searchLinks = load_links(soup = soup)
        # now that we have the url, use BeautifulSoup to extract the article text
        for item in searchLinks:
            destItem = destDir + item
            resultText(url = searchLinks[item], destFilename = destItem, header=h)
## Function to download the data files from the EU website
## Function to download the data files from the EU website
def getEUData(y):
    """Download EC agrifood price data for beef, pigmeat and cereal for year *y*.

    One CSV per product is written to ./Data/Raw/EC/.  A failed download or
    an invalid JSON payload is reported and skipped so the remaining
    products still download.
    """
    # sample every 6th week of the year; the API expects a comma-separated
    # list (the trailing comma of the original URL is preserved)
    wks = ','.join(str(w) for w in range(1, 53, 6)) + ','
    y = str(y)
    for p in ('beef', 'pigmeat', 'cereal'):
        # build the url for the api call
        url = ('https://www.ec.europa.eu/agrifood/api/' + p + '/prices?years=' + y
               + '&weeks=' + wks + '&beginDate=01/01/' + y + '&endDate=31/12/' + y)
        try:
            df = pd.DataFrame(requests.get(url).json())
            df.to_csv('./Data/Raw/EC/' + p + '_' + y + '.csv', index=False)
        except Exception:  # was a bare except; most often the cereals payload fails
            print('most likely cereals error')
## Function to rewrite a text file to the file with year in the file name
## Function to rewrite a text file to the file with year in the file name
def rewriteFile(inFilename):
    """Fold ``<..>NN-A.txt`` into ``<..>19NN.txt`` and delete the input file.

    The two-digit year is taken from the file's *base name*; the old code
    scanned the whole path, so a directory containing digits produced the
    wrong year, made the rename a no-op, and then deleted the input.
    """
    with open(inFilename, 'r', encoding="utf-8") as inFile:
        text = inFile.read()
    # first two-digit run in the base name is the (19xx) year
    yrNum = str(re.findall(r'\d{2}', os.path.basename(inFilename))[0])
    outFilename = inFilename.replace(yrNum + '-A.txt', '19' + yrNum + '.txt')
    # append: several '-A' files may fold into the same year file
    with open(outFilename, 'a', encoding="utf-8") as outFile:
        outFile.writelines(text)
    os.remove(inFilename)
# End -- Functions to handle the reading of different internet file types
# Begin -- Data exploration fuctions
# Stats functions
# Stats functions
def get_parametric(df, col):
    """Split *df* into normally / non-normally distributed member states.

    Runs a Shapiro-Wilk test on *col* per member state (states with a single
    row are skipped) and returns ``(df_parametric, df_nonparametric)``.
    """
    parametric, nonparametric = [], []
    for m in sorted(df['memberStateCode'].unique()):
        mdf = df[df['memberStateCode'] == m]
        if mdf.shape[0] > 1:  # Shapiro needs more than one observation
            status, color, p_val = shapiro_test(mdf[col])
            print('Member State: ', m, 'Shapiro Test: ', status, 'p-value: ', p_val)
            (parametric if status == 'passed' else nonparametric).append(mdf)
    # DataFrame.append was removed in pandas 2.0 -- concatenate instead
    df_parametric = pd.concat(parametric) if parametric else pd.DataFrame()
    df_nonparametric = pd.concat(nonparametric) if nonparametric else pd.DataFrame()
    return df_parametric, df_nonparametric
def get_levene(df, col):
    """Levene-test each member state's *col* variance against Ireland's.

    Prints the equal-variance verdict per state (True -> ANOVA is valid
    downstream, False -> use Kruskal-Wallis).

    NOTE(review): only the *last* state's frame (with IE rows appended) is
    returned -- preserved as-is since callers may rely on it, but it looks
    like an oversight.
    """
    IE = df[df['memberStateCode'] == 'IE']  # loop-invariant, hoisted
    for m in sorted(df['memberStateCode'].unique()):
        mdf = df[df['memberStateCode'] == m]
        if mdf.shape[0] > 1 and str(m) != 'IE':
            # Null hypothesis: the variances are equal
            levStat = levene(IE[col], mdf[col], center='mean')[1] > 0.05
            print('Member State: ', m, 'Levene Test: ', levStat)
            # DataFrame.append was removed in pandas 2.0 -- use concat
            mdf = pd.concat([mdf, IE])
    return mdf
def get_aov(df, col):
    """One-way ANOVA of *col* across memberStateCode; prints and returns the table."""
    formula = str(col) + ' ~ memberStateCode'
    aov = sm.stats.anova_lm(ols(formula, data=df).fit(), typ=2)
    state = df.memberStateCode.unique()[0]
    if aov['PR(>F)'][0] < 0.05:
        print('Significant difference between Ireland and ', state)
    else:
        print('No significant difference between Ireland and ', state)
    print(aov)
    return aov
# Shapiro-Wilk test Function
# Shapiro-Wilk test Function
def shapiro_test(x):
    """Shapiro-Wilk normality test -> (status, plot colour, p-value)."""
    p_val = stats.shapiro(x)[1]
    failed = p_val < 0.05  # reject normality at the 5% level
    status = 'failed' if failed else 'passed'
    color = 'red' if failed else 'blue'
    return status, color, p_val
## Custom Scatter Plot Function
## Custom Scatter Plot Function
def custom_scatterplot(df1, col_x='', col_y=''):
    """Regression scatter plot of *col_y* vs *col_x*, saved as an SVG under ./Images/."""
    xcol = str(col_x)
    ycol = str(col_y)
    # the old code created a throwaway plt.figure() here whose handle was
    # immediately overwritten by plt.subplots() -- an empty leaked figure
    f, ax = plt.subplots(figsize=(11.5, 11.5))
    sns.regplot(df1, x = df1[col_x] , y = df1[col_y], fit_reg=True, ax=ax, scatter_kws={'alpha':0.5})
    f.suptitle('Price of ' + ycol + ' vs. ' + xcol, fontsize=16)
    ax.set_xlabel(xcol)
    ax.set_ylabel(ycol)
    plt.savefig('./Images/Img_scatter' + xcol + '_vs_' + ycol + '.svg')
## Custom Bar Plot Function
## Custom Bar Plot Function
def custom_barplot(df1, col1=''):
    """2x2 normality panel (histogram, KDE, QQ, box) for *col1* with a Shapiro verdict in the title."""
    # Shapiro is unreliable above 5000 observations, so cap the sample
    # https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test (also for performance)
    if len(df1[col1]) > 5000:
        sampleSize = 5000
    else:
        sampleSize = len(df1[col1])
    df1 = df1.sample(sampleSize)
    f, ax = plt.subplots(2, 2, figsize=(15, 12))
    ax = ax.reshape(-1)  # flatten the 2x2 axes grid for easy indexing
    df1[col1].plot(ax=ax[0], kind='hist')
    ax[0].set_title('Histogram of {}'.format(col1))
    df1[col1].plot(ax=ax[1], kind='kde')
    ax[1].set_title('Density Plot of {}'.format(col1))
    # draw the QQ plot directly on ax[2]; the old plt.subplot(223) call
    # created a second axes on top of ax[2] and drew there instead
    stats.probplot(df1[col1], plot=ax[2])
    ax[2].set_title('QQ Plot of {}'.format(col1))
    df1[col1].plot(ax=ax[3], kind='box')
    ax[3].set_title('Box Plot of {}'.format(col1))
    status, color, p_val = shapiro_test(df1[col1])
    f.suptitle('Normality test for {} {} (p_value = {})'.format(col1, status, p_val), color=color, fontsize=12)
## Custom function to find missing values
## Custom function to find missing values
def num_missing(x):
    """Number of missing (NaN/None) entries in the Series *x*."""
    total = len(x.index)
    return total - x.count()
## Custom function to find number of unique values
## Custom function to find number of unique values
def num_unique(x):
    """Number of distinct values in *x*."""
    return np.unique(x).size
## create a function to profile the data frame
## create a function to profile the data frame
def prfile_df(df):
    """Render a pandas-profiling report for *df* as notebook widgets."""
    ProfileReport(df, title='Pandas Profiling Report', explorative=True).to_widgets()
# function to print sentiments
# of the sentence.
# function to print the sentiment of a sentence
def sentiment_scores(sentence):
    """Print VADER sentiment details for *sentence* and return its compound score."""
    analyser = SentimentIntensityAnalyzer()
    # polarity_scores returns a dict with pos / neg / neu / compound entries
    sentiment_dict = analyser.polarity_scores(sentence)
    print("Overall sentiment dictionary is : ", sentiment_dict)
    compound = sentiment_dict['compound']
    # standard VADER thresholds: >= 0.05 positive, <= -0.05 negative
    if compound >= 0.05:
        print("Positive")
    elif compound <= -0.05:
        print("Negative")
    else:
        print("Neutral")
    return compound
# End -- Data exploration fuctions
# Begin -- Functions to handle the loading the data frame
## Functions to load data frame from csv files
## Functions to load data frame from csv files
def load_Files(direc, files):
    """Lazily yield one DataFrame per CSV file name in *files*, read from *direc*."""
    for name in files:
        yield pd.read_csv(direc + name, delimiter=',', header=0, parse_dates=True, low_memory=True)
# Text data visuilisation functions
## define function to load image for the tkinter GUI
## define function to load image for the tkinter GUI
def load_image(year):
    """Load ./Images/<year>.png, resize it and place it on the tkinter window.

    Failures (missing file, GUI not available) are deliberately ignored --
    this is a best-effort display helper driven by the year slider.
    """
    load = Image.open("./Images/" + year + ".png")
    try:
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
        load = load.resize((1000, 1000), Image.LANCZOS)
        render = ImageTk.PhotoImage(load)
        label = tk.Label(image=render)
        # keep a reference on the label so tkinter doesn't garbage-collect it
        label.image = render
        label.place(x=100, y=100)
        #label.pack()
    except Exception:  # was a bare except: kept as silent best-effort
        pass
## Regression Functions
# Linear Regression Function
def linear_regression(X_train, X_test, y_train, y_test, folds, hyper_params):
    """OLS + RFE feature selection tuned with GridSearchCV.

    Plots train/test r-squared against the number of selected features,
    saves the plot under ./Images/, and prints train/test fit statistics.

    NOTE(review): the final predictions use the plain ``lrm`` OLS fit on all
    features, not ``model_cv.best_estimator_`` -- confirm this is intended.
    """
    target = str(y_test.name)  # used to name the saved plot file
    lrm = LinearRegression()
    lrm.fit(X_train, y_train) #fit an OLS model
    rfe = RFE(lrm) #apply RFE
    # set up GridSearchCV() over the RFE hyper-parameters (n_features_to_select)
    model_cv = GridSearchCV(estimator = rfe,
                            param_grid = hyper_params,
                            scoring= 'r2',
                            cv = folds,
                            verbose = 1,
                            return_train_score=True)
    # fit the model
    model_cv.fit(X_train, y_train)
    # cv results
    lrm_cv_results = pd.DataFrame(model_cv.cv_results_)
    # styling is a display-only no-op when not the last expression of a cell
    lrm_cv_results.style.background_gradient(cmap='summer_r')
    # plotting cv results
    plt.figure(figsize=(16,6))
    plt.plot(lrm_cv_results["param_n_features_to_select"], lrm_cv_results["mean_test_score"])
    plt.plot(lrm_cv_results["param_n_features_to_select"], lrm_cv_results["mean_train_score"])
    plt.xlabel('number of features')
    plt.ylabel('r-squared')
    plt.title("Optimal Number of Features")
    plt.legend(['test score', 'train score'], loc='upper left')
    plt.savefig('./Images/LinearRegression_' + target + '.svg')
    # Making predictions here
    y_preds_train = lrm.predict(X_train)
    y_preds_test = lrm.predict(X_test) #making predictions
    print("R-squared of the model in training set is: {}".format(lrm.score(X_train, y_train)))
    print("The optimal number of features is: {}".format(model_cv.best_params_))
    print("Root mean squared error of the prediction is: {}".format(rmse(y_train, y_preds_train)))
    print("-----Test set statistics-----")
    print("R-squared of the model in test set is: {}".format(lrm.score(X_test, y_test)))
    print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
    print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))
# Ridge Regression Function
def ridge_regression(X_train, y_train, X_test, y_test, alpha, folds):
    """Grid-search a Ridge model over *alpha*, plot the CV curve and print fit stats."""
    target = str(y_test.name)
    # grid search over the supplied alpha values
    search = GridSearchCV(Ridge(),
                          param_grid={'alpha': alpha},
                          scoring='r2',
                          cv=folds,
                          verbose=1,
                          return_train_score=True)
    search.fit(X_train, y_train)
    ridge = search.best_estimator_
    # cross-validation results as a frame (styling is display-only)
    cv_frame = pd.DataFrame(search.cv_results_)
    cv_frame.style.background_gradient(cmap='summer_r')
    # plot train/test r-squared against alpha and save the figure
    plt.figure(figsize=(16, 6))
    plt.plot(cv_frame["param_alpha"], cv_frame["mean_test_score"])
    plt.plot(cv_frame["param_alpha"], cv_frame["mean_train_score"])
    plt.xticks(ticks=alpha)
    plt.xlabel('alpha')
    plt.ylabel('r-squared')
    plt.title("Optimal Ridge Alpha")
    plt.legend(['test score', 'train score'], loc='upper left')
    plt.savefig('./Images/RidgeRegression_' + target + '.svg')
    # predictions with the best estimator
    train_preds = ridge.predict(X_train)
    test_preds = ridge.predict(X_test)
    print("R-squared of the model in training set is: {}".format(ridge.score(X_train, y_train)))
    print("The optimal value of alpha is: {}".format(ridge.alpha))
    print("Root mean squred error of the train set is: {}".format(mse(y_train, train_preds)**(1/2)))
    print("-----Test set statistics-----")
    print("R-squared of the model in test set is: {}".format(ridge.score(X_test, y_test)))
    print("Root mean squared error of the prediction is: {}".format(mse(y_test, test_preds)**(1/2)))
    print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - test_preds) / y_test)) * 100))
# Lasso Regression Function
def lasso_regression(X_train, y_train, X_test, y_test, alpha, folds):
    """Grid-search a Lasso model over *alpha*, plot the CV curve and print fit stats."""
    target = str(y_test.name)
    # grid search over alpha; n_jobs=-1 parallelises the folds and avoids a
    # wall of convergence warnings in the notebook output
    search = GridSearchCV(Lasso(),
                          param_grid={'max_iter': [1000], 'alpha': alpha},
                          scoring='r2',
                          cv=folds,
                          verbose=1,
                          n_jobs=-1,
                          return_train_score=True)
    search.fit(X_train, y_train)
    lasso = search.best_estimator_
    # cross-validation results as a frame (styling is display-only)
    cv_frame = pd.DataFrame(search.cv_results_)
    cv_frame.style.background_gradient(cmap='summer_r')
    # plot train/test r-squared against alpha and save the figure
    plt.figure(figsize=(16, 6))
    plt.plot(cv_frame["param_alpha"], cv_frame["mean_test_score"])
    plt.plot(cv_frame["param_alpha"], cv_frame["mean_train_score"])
    plt.xticks(ticks=alpha)
    plt.xlabel('alpha')
    plt.ylabel('r-squared')
    plt.title("Optimal Lasso Alpha")
    plt.legend(['test score', 'train score'], loc='upper left')
    plt.savefig('./Images/LassoRegression_' + target + '.svg')
    train_preds = lasso.predict(X_train)
    test_preds = lasso.predict(X_test)
    print("R-squared of the model in training set is: {}".format(lasso.score(X_train, y_train)))
    print("The optimal value of alpha is: {}".format(lasso.alpha))
    print("Root mean squared error of the train set is: {}".format(rmse(y_train, train_preds)))
    print("-----Test set statistics-----")
    print("R-squared of the model in test set is: {}".format(lasso.score(X_test, y_test)))
    print("Root mean squared error of the prediction is: {}".format(rmse(y_test, test_preds)))
    print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - test_preds) / y_test)) * 100))
# Decision Tree Regression
def decision_tree(X_train, X_test, y_train, y_test):
    """Fit a decision-tree regressor, print train/test stats and save the tree plot."""
    target = str(y_test.name)
    DTregressor = DecisionTreeRegressor()
    DTregressor.fit(X_train, y_train)
    y_pred_DT = DTregressor.predict(X_test)
    y_train_DT = DTregressor.predict(X_train)
    print("R-squared of the model in training set is: {}".format(DTregressor.score(X_train, y_train)))
    print("Root mean squared error of the training set is: {}".format(mse(y_train, y_train_DT)**(1/2)))
    print("-----Test set statistics-----")
    print("R-squared of the model in test set is: {}".format(DTregressor.score(X_test, y_test)))
    print("Root mean squared error of the prediction is: {}".format(mse(y_test, y_pred_DT)**(1/2)))
    print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_pred_DT) / y_test)) * 100))
    # Plotting the decision tree -- feature names come from the *parameter*
    # X_train (the old code read an undefined notebook-global ``X``)
    feature_names = X_train.columns
    cn = ['0', '1']
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(25, 25), dpi=300)
    # the tree is already fitted above; refitting inside plot_tree could
    # draw a different tree than the one whose statistics were printed
    tree.plot_tree(DTregressor, feature_names=feature_names, class_names=cn)
    plt.savefig('./Images/DecisionTree_' + target + '.svg')
# Random Forest Regressor Function
def random_forest_regressor(X_train, X_test, y_train, y_test):
    """Fit a 1000-tree random forest regressor and print train/test statistics."""
    regressor = RandomForestRegressor(n_estimators = 1000, random_state = 0)
    regressor.fit(X_train, y_train)
    y_pred_random = regressor.predict(X_test)
    print("R-squared of the model in training set is: {}".format(regressor.score(X_train, y_train)))
    # bug fix: the square root belongs on the MSE, not on the predictions
    # (the old code computed mse(y, sqrt(preds)) and labelled it "Mean squared error")
    print("Root mean squared error of the train set is: {}".format(mse(y_train, regressor.predict(X_train))**(1/2)))
    print("-----Test set statistics-----")
    print("R-squared of the model in test set is: {}".format(regressor.score(X_test, y_test)))
    print("Root mean squared error of the prediction is: {}".format(mse(y_test, y_pred_random)**(1/2)))
    print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_pred_random) / y_test)) * 100))
# image settings for the notebook
%matplotlib inline
# global matplotlib defaults: small fonts and a wide default figure
font={'family':'normal','weight':'normal','size':8}
matplotlib.rc('font',**font)
matplotlib.rcParams['figure.figsize'] = (12.0, 5.0)
matplotlib.rc('xtick', labelsize=9)
matplotlib.rc('ytick', labelsize=9)
matplotlib.rc('axes', labelsize=10)
matplotlib.rc('axes', titlesize=10)
sns.set_style('whitegrid')
# make sure the expected folder layout exists before any downloads run
if os.path.exists('./Data'):
    print('Data folder exists')
else:
    os.makedirs('./Data/Raw/IGA/')
    os.makedirs('./Data/Raw/IFA/')
    os.makedirs('./Data/Raw/EC/')
    os.makedirs('./Data/Final/')
if os.path.exists('./Images/'):
    print('Images folder exists')
else:
    os.makedirs('./Images/')
# master switch: set True to re-download the IGA/IFA/EC source data
getData = False
Data folder exists Images folder exists
The aim of this project is to examine the relationship between fodder prices and the price of animals, as live animals being sold into the food chain. The project will also examine the relationship between the quality of silage and the price of animals, and determine if there is a relationship between the quality of silage and the price of animal feed for Ireland.
Silage is a type of feed that is made by fermenting and preserving green forage crops, such as grass or maize, in a manner that allows it to be stored and fed to livestock over a long period of time. The quality of silage can be affected by several factors, including the type of forage that is used, the method of fermentation and preservation, and the storage conditions. We attempt to determine the quality of silage by searching the Irish Grassland Association website and the Irish Farmers Association website for articles that contain the word silage. We then use the sentiment analysis tool to determine the sentiment of the article, and use this to determine the quality of the silage.
The price of animals can be affected by a variety of factors, including the demand for and the supply of the animals. The quality of the feed that the animals are given can also affect their health and growth, which in turn can affect their price. This project will examine the price of two grain products in the EU, and the price of beef cattle and pigs, to determine if any relationship exists between the two sets of prices.
In this section we downloaded from the IGA, all of their yearly journals that are hosted as PDF files, and we extracted the text from the PDF files and saved them as text files. A second search was conducted on the IFA site, here we searched for the word 'Silage' on the site, we then parsed through the results brought back, and we downloaded and saved each article as a text file. All of this data is saved in the ./Data/Raw/IGA and ./Data/Raw/IFA folders respectively.
To examine the text data we created a documents dictionary, with the year as the key and the concatenated text from each year's text files as the value. We did some preliminary text cleaning on the dictionary values; this included removing punctuation, tab characters, new line characters and carriage return characters. We then saved the dictionary row by row into the file ./Data/Final/text_data.csv
# Get the data from the project
# Web scraping from https://proxyway.com/guides/web-scraping-with-python#:~:text=Steps%20to%20Build%20a%20Python%20Web%20Scraper%201,parameters.%20...%203%20Step%203%3A%20Write%20the%20Script
# setting variable to stop going to IGA
if getData == True:
    url = "https://www.irishgrassland.ie/journals/"
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    soup = soup.find('table')
    soup = soup.find_all('a')
    # download the pdfs from Irish Grassland Journal and save them in the Raw folder
    # parse the url for the file name
    destDir = './Data/Raw/IGA/'
    for element in soup:
        url = element.get('href')
        # the 4 characters before the last '_' in the link are the journal year
        end = url.rfind('_')
        if end > 0 : # links without an underscore are not journal pdfs -- skip them
            start = end - 4
            destFilename = destDir + 'Irish Grassland and Animal Production Association Journal_' + url[start:end] + '.pdf'
            rq.urlretrieve(url, destFilename)
# extract the text from the pdfs and save them raw folder
# reference https://www.geeksforgeeks.org/working-with-pdf-files-in-python/
if getData == True:
    destDir = './Data/Raw/IGA/'
    files = os.listdir(destDir)
    files = filter(lambda f: f.endswith(('.pdf', '.PDF')), files)
    for f in files:
        txtFilename = destDir + f.replace('.pdf', '.txt')
        # context managers close both the pdf and the output file
        # (the old code never closed the pdf handle)
        with open(destDir + f, 'rb') as pdfFileObj:
            pdfReader = PyPDF2.PdfReader(pdfFileObj)
            # concatenate the text of every page of the journal
            text = "".join(page.extract_text() for page in pdfReader.pages)
        with open(txtFilename, 'w', encoding="utf-8") as txtFile:
            txtFile.writelines(text)
# Now I need to handle the 4 files with A in the filename
if getData == True:
    destDir = './Data/Raw/IGA/'
    textFiles = os.listdir(destDir)
    # only the split journals named like '62-A.txt' need rewriting
    textFiles = filter(lambda f: f.endswith(('A.txt')), textFiles)
    for f in textFiles:
        rewriteFile(destDir + f)
# download the various articles that show in this search https://www.ifa.ie/?s=silage
# There are 8 pages of results for this search, so I have to loop through the pages and extract the links pages are structured as https://www.ifa.ie/page/2/?s=silage
# header code re engineered from https://stackoverflow.com/questions/41946166/requests-get-returns-403-while-the-same-url-works-in-browser
# only want to declare once in the code
if getData == True:
    # browser-like user agent: the IFA site returns 403 to the default one
    h = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'} # This is chrome, you can set whatever browser you like
    # initialise the search and load the first page to get the number of pages
    soup = searchIFA(searchTerm = 'silage', pageNumber = 1, header = h)
    saveResults(soup)
# load the text files from the Raw folder and create a dataframe
destDir = ['./Data/Raw/IGA/', './Data/Raw/IFA/']
documents = {} # dictionary: year -> concatenated text of that year's files
years = []
for d in destDir:
    files = os.listdir(d)
    files = filter(lambda f: f.endswith(('.txt')), files)
    # Load the text data from the text documents
    for file in files:
        # Extract the year from the file name (the part after the last '_')
        year = int(file.split("_")[-1].split(".")[0])
        # if the year is already known, append this file's text to that year's entry
        if year in years:
            #print('Duplicate year found')
            with open(d + file, "r" , encoding="utf-8") as f:
                text = f.read()
            # add the new text to the existing text
            documents[year] = documents[year] + text
        else:
            years.append(year)
            with open(d + file, "r" , encoding="utf-8") as f:
                text = f.read()
            documents[year] = text
# order the documents by year
documents = dict(sorted(documents.items()))
# Pre-process the text data
# single-pass cleanup table: strips punctuation and control whitespace
# (one str.translate call instead of nine chained .replace calls)
_clean_table = str.maketrans('', '', '.,:;?!\n\t\r')
for i in years:
    # lower-case and remove punctuation / newline / tab / carriage returns
    documents[i] = documents[i].lower().translate(_clean_table)
# Write out the documents to a csv file in the final folder
finalDoc = './Data/Final/' + 'text_data.csv'
with open(finalDoc, 'w', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Key', 'Value'])
    for key, value in documents.items():
        writer.writerow([key, value])
# (no explicit close needed -- the with-statement closes the file;
#  the old trailing f.close() was redundant)
To build the word clouds, we used the wordcloud library and a loop to create a dictionary of word clouds, with the year as key and the word cloud object as value. Our initial look at the word clouds indicated that there were a lot of words of fewer than 3 characters that detracted from the word cloud display; once we built our first dictionary of word clouds, we used the meta information in these word clouds to create a data frame containing 'Year', 'Word' and 'Count', and we then filtered this data frame to contain only words of fewer than 3 characters. These filtered words were added to the STOPWORDS set that comes with the wordcloud package, and we then built the word clouds for each year again using the updated STOPWORDS. Our preliminary examination of the original word clouds also highlighted some names that were dominating later years; these names were added to the STOPWORDS as well. This was an example of the CRISP-DM process in action.
# Create a word cloud for each year
# and remove default stop words
wordClouds = {}
for i in years:
    # one cloud per year, built from that year's concatenated document text
    wordClouds[i] = WordCloud(stopwords=STOPWORDS, background_color='white', width=1200, height=1000).generate(documents[i])
# examine the wordclouds to update the stop words
# build a dataframe of the most frequent words per year
rows = []
for i in years:
    # only the top 100 words matter -- we display only the top 10 later
    top_words = sorted(wordClouds[i].words_.items(), key=lambda x: x[1], reverse=True)[:100]
    top_words = [row[0] for row in top_words]
    wordCount = len(top_words)
    for w in top_words:
        rows.append({'Year': i, 'Word': w, 'Count': wordCount})
# DataFrame.append was removed in pandas 2.0 (and was O(n^2)); build once
df = pd.DataFrame(rows, columns=['Year', 'Word', 'Count'])
df.head()
df['WordLenght'] = df.Word.str.len() # the length of each word
# keep only the very short words (< 3 characters): these become stop words
df = df[df['WordLenght'] < 3]
display(df.Word, df.WordLenght) # review the words to be removed
0 co
1 j
5 m
10 p
11 e
..
4621 co
4625 td
4672 s
4714 s
4744 t
Name: Word, Length: 273, dtype: object
0 2
1 1
5 1
10 1
11 1
..
4621 2
4625 2
4672 1
4714 1
4744 1
Name: WordLenght, Length: 273, dtype: int64
# and remove stop words
# NB: set.update() mutates in place and returns None -- the old code bound
# stopWrds to None and only worked because WordCloud(stopwords=None) falls
# back to the (already mutated) default STOPWORDS set
STOPWORDS.update(df.Word)
STOPWORDS.update(['said', 'must', 'joe', 'healy', 'thomas', 'cooney']) # last 4 are people's names that dominated the clouds
stopWrds = STOPWORDS
# This is repeated from above but with the updated stop words
wordClouds = {}
for i in years:
    wordClouds[i] = WordCloud(stopwords=stopWrds, background_color='white', width=1200, height=1000).generate(documents[i])
At first we used Tkinter to display the word clouds, but this was not very satisfactory as we could not get the Tkinter window to be part of the notebook. We therefore used the ipywidgets library to display the word clouds, as the widgets work well in the notebook and do not distract the reader from the report.
""" # all of this works but is outside of the notebook and seems a bit clunky
# code to create the word cloud images
for y in years:
# Create the word cloud
showWordCloud(y)
# this works but is outside of hte notebook
window = tk.Tk()
window.geometry("1400x1200")
window.title("Word Cloud Slider")
# Create the slider widget
tkInt = (years[len(years) - 1] - years[0]/len(years))
slider = tk.Scale(window, from_=years[0], to=years[len(years) - 1], orient=tk.HORIZONTAL, tickinterval=tkInt, resolution=1, length=500, command=load_image)
label = tk.Label(window, text="Word Cloud of the Year", font=("Arial", 20))
slider.pack()
label.pack()
# Run the tkinter event loop
window.mainloop()
"""
# this is a better implementation of how to view the word clouds
def browse_images(wordClouds):
    """Interactive (ipywidgets) browser: a slider picks a year, showing its
    word cloud next to a bar chart of that year's top 10 words.

    NOTE(review): reads the notebook-global ``years`` list to map the
    slider index to a year -- confirm it is populated before calling.
    """
    n = len(wordClouds.items())
    def view_image(i):
        N = 10
        # top-N (word -> frequency) dictionary for the selected year's cloud
        srtWords = dict(sorted(wordClouds[years[i]].words_.items(), key=lambda x: x[1], reverse=True)[:N])
        # had to use subplots to get the word cloud to display in one row
        fig, ax = plt.subplots(figsize=(18,10), nrows=1, ncols=2)
        ax[0].imshow(wordClouds[years[i]], interpolation='bilinear')
        ax[0].set_title('Word cloud for the year ' + str(years[i]))
        ax[0].axis('off')
        ax[1].barh(list(srtWords.keys()), list(srtWords.values()), align='edge')
        ax[1].set_title('Top ' + str(N) + ' words for the year ' + str(years[i]))
        #ax[1].axis('off')
        plt.savefig('./Images/WordCloud_' + str(years[i]) + '_.svg')
    interact(view_image, i=(0,n-1))
browse_images(wordClouds)
interactive(children=(IntSlider(value=23, description='i', max=47), Output()), _dom_classes=('widget-interact'…
We can see by moving the slider through the years that the top 10 words found in the documentation show a change in the focus of farming concerns. For instance, in 1962 silage was the second most common word, then in 1968 soil and soil nutrients dominate the top 10 words. In 1969 we see silage, silage making, cattle and liveweight gain all in the top ten, then in 1970 feed has taken over along with calve and calf, suggesting that the focus has shifted to rearing young animals. Throughout the 1970s, 1980s and up to the 2000s there is a shift of focus onto milk production, as evidenced by the words milk, cow and calving, while the animal feed requirements are evidenced by the words silage, grass, dry matter (a key component of silage) and barley. In the 1980s sheep and other ovine words start appearing, but silage and grass are still the most common words. In 2002 the word 'maize' appears in the word cloud; it does not make the top 10, but its appearance is interesting as maize is the American term for corn, and in the EU it is used as an animal feed. Throughout the 2000s silage disappears from the top 10 words and is replaced by grass, maize, barley and dry matter, but it still appears in the word clouds, suggesting that it remains a key component of animal feed.
Because 'will' and 'farm' show up so much, there is an argument to be made that they should be removed from the word cloud, but I will leave them in for now as they reveal an interesting trend in the data: farm succession was a key concern throughout the decades and is still a concern today.
# Download beef prices from https://www.ec.europa.eu/agrifood/api/beef/prices?years=1991,2022&months=1,2,3,4,5,6,7,8,9,10,11,12&beginDate=01/01/1991&endDate=31/12/2021
# The range of years is 2009 to 2022
# Only re-download when the getData flag is set; idiomatic truth test
# instead of "== True".
if getData:
    # NOTE(review): range(2009, 2022) stops at 2021, while the comment above
    # says "2009 to 2022" — confirm whether 2022 should be included.
    yr = range(2009, 2022)
    for y in yr:
        getEUData(y)
# display the data from one csv file and check for the values in the category/productname column
# Pig meat: inspect one sample year (2019) to learn the schema before the
# bulk load.  Prices in this file use a comma decimal separator (e.g.
# €216,11) and the carcass class lives in the pigClass column.
df = pd.read_csv('./Data/Raw/EC/pigmeat_2019.csv')
print('Pig Meat data frame is ' + str(df.shape) + ' rows and columns')
display(df.head())
display(df.pigClass.unique())
display(df.info())
#prfile_df(df)
# display the data from one csv file and check for the values in the category/productname/stageName column
# Cereals: the product is in productName and stageName records the point in
# the supply chain at which the price was quoted.
df = pd.read_csv('./Data/Raw/EC/cereal_2019.csv')
print('Cereals data frame is ' + str(df.shape) + ' rows and columns')
display(df.head())
# get the unique column values from stageName column
display(df.stageName.unique())
display(df.productName.unique())
display(df.info())
#prfile_df(df)
# display the data from one csv file and check for the values in the category/productname column
# Beef: the animal category is in the category column; prices here appear
# dot-decimal (e.g. €201.66), unlike the pig file above.
df = pd.read_csv('./Data/Raw/EC/beef_2019.csv')
print('Beef Meat data frame is ' + str(df.shape) + ' rows and columns')
display(df.head())
display(df.category.unique())
display(df.info())
#prfile_df(df)
Pig Meat data frame is (847, 8) rows and columns
| memberStateCode | memberStateName | beginDate | endDate | weekNumber | price | unit | pigClass | |
|---|---|---|---|---|---|---|---|---|
| 0 | AT | Austria | 02/12/2019 | 08/12/2019 | 49 | €216,11 | 100 KG | S |
| 1 | AT | Austria | 02/12/2019 | 08/12/2019 | 49 | €204,93 | 100 KG | E |
| 2 | AT | Austria | 02/12/2019 | 08/12/2019 | 49 | €212,51 | 100 KG | Average S + E |
| 3 | BE | Belgium | 02/12/2019 | 08/12/2019 | 49 | €68,50 | P | Piglet |
| 4 | BE | Belgium | 02/12/2019 | 08/12/2019 | 49 | €172,80 | 100 KG | E |
array(['S', 'E', 'Average S + E', 'Piglet', 'R'], dtype=object)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 847 entries, 0 to 846 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 memberStateCode 847 non-null object 1 memberStateName 847 non-null object 2 beginDate 847 non-null object 3 endDate 847 non-null object 4 weekNumber 847 non-null int64 5 price 847 non-null object 6 unit 847 non-null object 7 pigClass 847 non-null object dtypes: int64(1), object(7) memory usage: 53.1+ KB
None
Cereals data frame is (590, 10) rows and columns
| memberStateCode | memberStateName | beginDate | endDate | weekNumber | price | unit | productName | marketName | stageName | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AT | Austria | 16/12/2019 | 22/12/2019 | 25 | €140,00 | TONNES | Maize | Wien | Departure from silo - after some storage - on ... |
| 1 | AT | Austria | 16/12/2019 | 22/12/2019 | 25 | €139,00 | TONNES | Feed barley | Wien | Departure from silo - after some storage - on ... |
| 2 | BE | Belgium | 16/12/2019 | 22/12/2019 | 25 | €184,00 | TONNES | Maize | Brussel - Bruxelles | Departure from farm or from production area - ... |
| 3 | BG | Bulgaria | 16/12/2019 | 22/12/2019 | 25 | €168,73 | TONNES | Milling wheat | Dobrich | Departure from farm or from production area - ... |
| 4 | BG | Bulgaria | 16/12/2019 | 22/12/2019 | 25 | €158,50 | TONNES | Feed wheat | Dobrich | Departure from farm or from production area - ... |
array(['Departure from silo - after some storage - on truck or other transport means',
'Departure from farm or from production area - on truck or other transport means',
'Deliver to first customer - silo or processing plant - on truck or other transport means',
'Delivered to port - grain delivered to a port silo by train or truck or barge',
'Cost, Insurance and Freight - Incoterm',
'Shipped to the place (port), unloaded, on truck leaving port',
'Free On Board - Incoterm', 'Price at farm gate'], dtype=object)
array(['Maize', 'Feed barley', 'Milling wheat', 'Feed wheat', 'Feed oats',
'Malting barley', 'Milling rye', 'Durum wheat', 'Feed rye'],
dtype=object)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 590 entries, 0 to 589 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 memberStateCode 590 non-null object 1 memberStateName 590 non-null object 2 beginDate 590 non-null object 3 endDate 590 non-null object 4 weekNumber 590 non-null int64 5 price 590 non-null object 6 unit 590 non-null object 7 productName 590 non-null object 8 marketName 590 non-null object 9 stageName 590 non-null object dtypes: int64(1), object(9) memory usage: 46.2+ KB
None
Beef Meat data frame is (5849, 9) rows and columns
| memberStateCode | memberStateName | beginDate | endDate | weekNumber | price | unit | category | productCode | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | AT | Austria | 02/12/2019 | 08/12/2019 | 49 | €201.66 | €/100Kg | Cows | D P2 |
| 1 | AT | Austria | 02/12/2019 | 08/12/2019 | 49 | €240.18 | €/100Kg | Cows | D O3 |
| 2 | AT | Austria | 02/12/2019 | 08/12/2019 | 49 | €328.36 | €/100Kg | Bulls | B R3 |
| 3 | AT | Austria | 02/12/2019 | 08/12/2019 | 49 | €236.01 | €/100Kg | Cows | D O2 |
| 4 | AT | Austria | 02/12/2019 | 08/12/2019 | 49 | €335.06 | €/100Kg | Young cattle | Z O2 |
array(['Cows', 'Bulls', 'Young cattle', 'Steers', 'Young bulls',
'Heifers', 'Adult male indicative price', 'Calves slaughtered <8M'],
dtype=object)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5849 entries, 0 to 5848 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 memberStateCode 5849 non-null object 1 memberStateName 5849 non-null object 2 beginDate 5849 non-null object 3 endDate 5849 non-null object 4 weekNumber 5849 non-null int64 5 price 5849 non-null object 6 unit 5849 non-null object 7 category 5849 non-null object 8 productCode 5849 non-null object dtypes: int64(1), object(8) memory usage: 411.4+ KB
None
The project is seeking to predict meat prices based on the price of animal feed. To do this we will create a dataset with 2 target variables — the beef price for 'Adult male indicative price' and the price of 'Average S + E' for pig meat — while the predictor variables will be the prices of 'Feed wheat' and 'Feed barley'. There are two targets, one for beef and one for pig meat; it was not possible to find prices for other fodder products such as hay, haylage or silage. The only site that I could find was https://www.clal.it and it did not have data that could be downloaded programmatically.
The data was downloaded from https://www.ec.europa.eu/agrifood/api/ and the data was cleaned and prepared for analysis. The data was then analysed to determine if there was a relationship between the price of animal feed and the price of animals.
Below we take a look at the raw data, to see what we are working with, and to determine which columns we will use for our analysis from each set, and which columns are common to all sets. These columns are as follows:
We will drop the following columns from the data sets:
# data cleanup and preparation
# list the raw EC files once, then split them lazily by commodity prefix;
# each of these is consumed a single time by load_Files below
files = os.listdir('./Data/Raw/EC/')
beef_files = (name for name in files if name.startswith('beef'))
cereal_files = (name for name in files if name.startswith('cereal'))
pig_files = (name for name in files if name.startswith('pig'))
# because the data are slightly different structures, I have to handle them separately before joining them together
# Create the beef data frame
df_beef = pd.concat(load_Files('./Data/Raw/EC/', beef_files))
# keep only the overall indicative-price series used as the beef target
df_beef = df_beef[df_beef['category'] == 'Adult male indicative price']
df_beef = df_beef[['memberStateCode', 'beginDate', 'endDate', 'weekNumber', 'price']]
df_beef['beginDate'] = pd.to_datetime(df_beef['beginDate'], format='%d/%m/%Y')
df_beef['endDate'] = pd.to_datetime(df_beef['endDate'], format='%d/%m/%Y')
# strip the euro sign and the comma, then cast to float.  '€' needs no
# backslash ('\€' is an invalid string escape and € is not a regex
# metacharacter).  NOTE(review): beef prices appear dot-decimal (e.g.
# €201.66), so the comma is removed as a thousands separator here, unlike
# the ',' -> '.' mapping used for cereal and pig — confirm against the API.
df_beef['price'] = df_beef['price'].replace({'€': '', ',': ''}, regex=True).astype(float)
df_beef = df_beef.rename(columns={'price': 'beef_price', 'category': 'productName'})
# Create the cereal data frame
df_cereal = pd.concat(load_Files('./Data/Raw/EC/', cereal_files))
# split out one frame per feed product, renaming 'price' so that the merged
# frame ends up with one clearly named price column per product
df_cereal_wheat = df_cereal[df_cereal['productName'] == 'Feed wheat']
df_cereal_wheat = df_cereal_wheat.rename(columns={'price': 'wheat_price'})
df_cereal_barley = df_cereal[df_cereal['productName'] == 'Feed barley']
df_cereal_barley = df_cereal_barley.rename(columns={'price': 'barley_price'})
df_cereal_oats = df_cereal[df_cereal['productName'] == 'Feed oats']
df_cereal_oats = df_cereal_oats.rename(columns={'price': 'oats_price'})
df_cereal_rye = df_cereal[df_cereal['productName'] == 'Feed rye']
df_cereal_rye = df_cereal_rye.rename(columns={'price': 'rye_price'})
# merge the four product frames; the outer join keeps state/week rows where
# only some products have a quoted price (the rest become NaN)
df_cereal = pd.merge(df_cereal_wheat, df_cereal_barley, how = 'outer', on=['memberStateCode', 'beginDate', 'endDate', 'weekNumber'])
df_cereal = pd.merge(df_cereal, df_cereal_oats, how = 'outer', on=['memberStateCode', 'beginDate', 'endDate', 'weekNumber'])
df_cereal = pd.merge(df_cereal, df_cereal_rye, how = 'outer', on=['memberStateCode', 'beginDate', 'endDate', 'weekNumber'])
df_cereal = df_cereal[['memberStateCode', 'beginDate', 'endDate', 'weekNumber', 'wheat_price', 'barley_price', 'oats_price', 'rye_price']]
df_cereal['beginDate'] = pd.to_datetime(df_cereal['beginDate'], format='%d/%m/%Y')
df_cereal['endDate'] = pd.to_datetime(df_cereal['endDate'], format='%d/%m/%Y')
# strip the euro sign and turn the comma decimal separator into a dot.
# '€' needs no backslash ('\€' is an invalid string escape sequence and
# € is not a regex metacharacter).
df_cereal['wheat_price'] = df_cereal['wheat_price'].replace({'€': '', ',': '.'}, regex=True).astype(float)
df_cereal['barley_price'] = df_cereal['barley_price'].replace({'€': '', ',': '.'}, regex=True).astype(float)
df_cereal['oats_price'] = df_cereal['oats_price'].replace({'€': '', ',': '.'}, regex=True).astype(float)
df_cereal['rye_price'] = df_cereal['rye_price'].replace({'€': '', ',': '.'}, regex=True).astype(float)
display(df_cereal.head())
# Create the pig data frame; the 'Average S + E' class is the pig target
df_pig = pd.concat(load_Files('./Data/Raw/EC/', pig_files))
df_pig = df_pig[df_pig['pigClass'] == 'Average S + E']
df_pig = df_pig[['memberStateCode', 'beginDate', 'endDate', 'weekNumber', 'price']]
df_pig['beginDate'] = pd.to_datetime(df_pig['beginDate'], format='%d/%m/%Y')
df_pig['endDate'] = pd.to_datetime(df_pig['endDate'], format='%d/%m/%Y')
# strip the euro sign and turn the comma decimal separator into a dot.
# '€' needs no backslash ('\€' is an invalid string escape sequence and
# € is not a regex metacharacter).
df_pig['price'] = df_pig['price'].replace({'€': '', ',': '.'}, regex=True).astype(float)
df_pig = df_pig.rename(columns={'price': 'pig_price', 'pigClass': 'productName'})
| memberStateCode | beginDate | endDate | weekNumber | wheat_price | barley_price | oats_price | rye_price | |
|---|---|---|---|---|---|---|---|---|
| 0 | BG | 2009-12-14 | 2009-12-20 | 25 | 97.15 | 92.03 | NaN | NaN |
| 1 | RO | 2009-12-14 | 2009-12-20 | 25 | 101.91 | 82.95 | NaN | NaN |
| 2 | BG | 2010-12-13 | 2010-12-19 | 25 | 186.62 | NaN | NaN | NaN |
| 3 | BG | 2010-11-01 | 2010-11-07 | 19 | 176.40 | NaN | NaN | NaN |
| 4 | RO | 2010-11-01 | 2010-11-07 | 19 | 177.03 | 104.11 | NaN | NaN |
# missing values in the cereal data frame due to oats and rye not being available for some countries
# fill the missing values using scikit learn KNN feature imputation https://scikit-learn.org/stable/modules/impute.html
display(df_cereal.isnull().sum())
df_cereal['year'] = df_cereal['beginDate'].dt.year
from sklearn.impute import KNNImputer
# impute on the numeric columns only; year and weekNumber give the imputer
# a notion of temporal proximity between rows.  (Removed an unused
# "NaN = np.nan" local that was never referenced.)
df_cereal_imp = df_cereal[['year','weekNumber','wheat_price', 'barley_price', 'oats_price', 'rye_price']]
imputer = KNNImputer(n_neighbors=4, weights="uniform")
df_cereal_imputed = imputer.fit_transform(df_cereal_imp)
df_cereal_imputed = pd.DataFrame(df_cereal_imputed, columns=['year','weekNumber','wheat_price', 'barley_price', 'oats_price', 'rye_price'])
# merge the imputed values back by row index; the overlapping column names
# get _x (original) / _y (imputed) suffixes
df_cereal = df_cereal.merge(df_cereal_imputed, how='left', left_index=True, right_index=True)
# drop the original NA-bearing price columns and the duplicated key columns
df_cereal = df_cereal.drop(['year_y', 'weekNumber_y', 'wheat_price_x', 'barley_price_x', 'oats_price_x', 'rye_price_x'], axis=1)
# rename the suffixed survivors back to their plain names
df_cereal = df_cereal.rename(columns={'year_x': 'year', 'weekNumber_x': 'weekNumber', 'wheat_price_y': 'wheat_price', 'barley_price_y': 'barley_price', 'oats_price_y': 'oats_price', 'rye_price_y': 'rye_price'})
display(df_cereal.isnull().sum())
memberStateCode 0 beginDate 0 endDate 0 weekNumber 0 wheat_price 422 barley_price 113 oats_price 1251 rye_price 1237 dtype: int64
memberStateCode 0 beginDate 0 endDate 0 weekNumber 0 year 0 wheat_price 0 barley_price 0 oats_price 0 rye_price 0 dtype: int64
The temp_df contains the output of the pandas describe function. The missing_df contains the output of a function which counts the missing values in each column.
# join the dataframes together (first attempt: beef-led, week-only key)
df = (
    df_beef
    .merge(df_cereal, how='left', on=['memberStateCode', 'weekNumber'])
    .merge(df_pig, how='left', on=['memberStateCode', 'weekNumber'])
)
# transposed describe(): one summary row per numeric column
temp_df = df.describe().T
# per-column missing-value counts
missing_df = pd.DataFrame({'missing': df.apply(num_missing, axis=0)})
display(temp_df.style)
display(missing_df.style)
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| weekNumber | 212838.000000 | 26.406788 | 14.483621 | 1.000000 | 13.000000 | 25.000000 | 37.000000 | 49.000000 |
| beef_price | 212838.000000 | 333.727669 | 56.148034 | 143.470000 | 299.530000 | 335.090000 | 373.160000 | 512.320000 |
| year | 202701.000000 | 2016.918155 | 2.534274 | 2009.000000 | 2015.000000 | 2017.000000 | 2019.000000 | 2021.000000 |
| wheat_price | 202701.000000 | 167.213238 | 29.089562 | 97.150000 | 147.385000 | 162.930000 | 182.960000 | 308.000000 |
| barley_price | 202701.000000 | 161.523086 | 30.742933 | 82.950000 | 140.000000 | 157.460000 | 178.000000 | 309.000000 |
| oats_price | 202701.000000 | 151.878114 | 27.372559 | 73.410000 | 131.500000 | 147.092500 | 167.750000 | 287.000000 |
| rye_price | 202701.000000 | 136.408155 | 22.446876 | 71.140000 | 122.200000 | 131.125000 | 147.910000 | 230.230000 |
| pig_price | 212838.000000 | 157.195957 | 19.986996 | 94.110000 | 143.500000 | 156.190000 | 170.400000 | 237.000000 |
| missing | |
|---|---|
| memberStateCode | 0 |
| beginDate_x | 0 |
| endDate_x | 0 |
| weekNumber | 0 |
| beef_price | 0 |
| beginDate_y | 10137 |
| endDate_y | 10137 |
| year | 10137 |
| wheat_price | 10137 |
| barley_price | 10137 |
| oats_price | 10137 |
| rye_price | 10137 |
| beginDate | 0 |
| endDate | 0 |
| pig_price | 0 |
As we can see there is an issue with the dates beginDate and endDate, they do not match the weekNumber in the different data sets. This is a problem as we will need to merge the data sets on the weekNumber column.
The solution is to create year column from the beginDate column, and then drop the beginDate and endDate columns.
We can see that there are missing values in the data set, but this time they are in the price column and year_y. This is because cereal data set does not have the same amount of data for each year, the solution is to use the cereal data as the first merge.
# Derive a year column from beginDate, then drop the raw date columns so the
# frames can be merged on (memberStateCode, year, weekNumber) instead.
df_beef = df_beef.assign(year=df_beef['beginDate'].dt.year).drop(columns=['beginDate', 'endDate'])
df_pig = df_pig.assign(year=df_pig['beginDate'].dt.year).drop(columns=['beginDate', 'endDate'])
# df_cereal already gained its year column during imputation
df_cereal = df_cereal.drop(columns=['beginDate', 'endDate'])
Having fixed the dates we can now merge the data sets on the weekNumber column.
# join the dataframes together — cereal-led this time, with year in the key,
# so the sparser cereal coverage drives the left joins
df = (
    df_cereal
    .merge(df_pig, how='left', on=['memberStateCode', 'year', 'weekNumber'])
    .merge(df_beef, how='left', on=['memberStateCode', 'year', 'weekNumber'])
)
# transposed describe() plus per-column missing-value counts
temp_df = df.describe().T
missing_df = pd.DataFrame({'missing': df.apply(num_missing, axis=0)})
display(temp_df.style)
display(missing_df.style)
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| weekNumber | 1493.000000 | 24.292699 | 15.501732 | 1.000000 | 13.000000 | 25.000000 | 37.000000 | 49.000000 |
| year | 1493.000000 | 2016.828533 | 2.605689 | 2009.000000 | 2015.000000 | 2017.000000 | 2019.000000 | 2021.000000 |
| wheat_price | 1493.000000 | 166.594029 | 28.200920 | 97.150000 | 147.400000 | 162.460000 | 181.940000 | 308.000000 |
| barley_price | 1493.000000 | 161.451159 | 30.756818 | 82.950000 | 139.560000 | 157.300000 | 178.400000 | 309.000000 |
| oats_price | 1493.000000 | 151.957585 | 26.978625 | 73.410000 | 132.500000 | 147.517500 | 167.745000 | 287.000000 |
| rye_price | 1493.000000 | 136.228257 | 22.585114 | 71.140000 | 121.890000 | 131.027500 | 148.410000 | 230.230000 |
| pig_price | 1353.000000 | 158.662313 | 21.038166 | 101.910000 | 143.660000 | 159.250000 | 173.570000 | 225.920000 |
| beef_price | 1275.000000 | 344.201702 | 57.350059 | 143.470000 | 308.700000 | 349.690000 | 380.385000 | 512.320000 |
| missing | |
|---|---|
| memberStateCode | 0 |
| weekNumber | 0 |
| year | 0 |
| wheat_price | 0 |
| barley_price | 0 |
| oats_price | 0 |
| rye_price | 0 |
| pig_price | 140 |
| beef_price | 218 |
# remove the rows with missing values, then order by time and country and
# renumber the index
df = (
    df.dropna()
      .sort_values(by=['year', 'weekNumber', 'memberStateCode'])
      .reset_index(drop=True)
)
# re-summarise to confirm nothing is missing any more
temp_df = df.describe().T
missing_df = pd.DataFrame({'missing': df.apply(num_missing, axis=0)})
display(temp_df.style)
display(missing_df.style)
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| weekNumber | 1241.000000 | 25.715552 | 14.854471 | 1.000000 | 13.000000 | 25.000000 | 37.000000 | 49.000000 |
| year | 1241.000000 | 2016.848509 | 2.536749 | 2009.000000 | 2015.000000 | 2017.000000 | 2019.000000 | 2021.000000 |
| wheat_price | 1241.000000 | 166.657168 | 29.143361 | 101.910000 | 146.935000 | 162.080000 | 182.000000 | 308.000000 |
| barley_price | 1241.000000 | 160.379903 | 30.616877 | 82.950000 | 139.000000 | 156.290000 | 177.000000 | 309.000000 |
| oats_price | 1241.000000 | 151.400858 | 27.180133 | 73.410000 | 131.337500 | 146.667500 | 167.260000 | 287.000000 |
| rye_price | 1241.000000 | 135.776459 | 22.571790 | 71.140000 | 121.937500 | 130.597500 | 147.280000 | 230.230000 |
| pig_price | 1241.000000 | 156.789742 | 19.997224 | 101.910000 | 142.490000 | 157.210000 | 171.210000 | 216.800000 |
| beef_price | 1241.000000 | 342.864803 | 57.508153 | 143.470000 | 306.290000 | 347.730000 | 379.160000 | 512.320000 |
| missing | |
|---|---|
| memberStateCode | 0 |
| weekNumber | 0 |
| year | 0 |
| wheat_price | 0 |
| barley_price | 0 |
| oats_price | 0 |
| rye_price | 0 |
| pig_price | 0 |
| beef_price | 0 |
Discuss in detail the process of acquiring your raw data, detailing the positive and/or negative aspects of your research and acquisition. This should include the relevance and implications of any and all licensing/permissions associated with the data. [0-15]
Data was sourced from a number of sources, including the https://www.irishgrassland.ie/journals/ site and the https://www.teagasc.ie/ site. The data was gathered from the pdf files on the sites and converted to csv files using the tabula-py package. The data was then cleaned and merged into a single dataset, and analysed using the pandas and matplotlib packages.
Exploratory Data Analysis helps to identify patterns, inconsistencies, anomalies, missing data, and other attributes and issues in data sets so problems can be addressed.
Evaluate your raw data and detail, in depth, the various attributes and issues that you find. Your evaluation should reference evidence to support your chosen methodology and use visualizations to illustrate your findings.[0-25]
Display the shape and types of data
# per-column distinct-value counts and dtypes, joined onto the numeric
# summary (temp_df) and missing counts (missing_df) produced above
unq_df = pd.DataFrame({'unique': df.apply(num_unique, axis=0)})
types_df = pd.DataFrame({'DataType': df.dtypes})
summary_df = temp_df.join(missing_df).join(unq_df).join(types_df)
display(summary_df.style) # Numerical Data
| count | mean | std | min | 25% | 50% | 75% | max | missing | unique | DataType | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| weekNumber | 1241.000000 | 25.715552 | 14.854471 | 1.000000 | 13.000000 | 25.000000 | 37.000000 | 49.000000 | 0 | 9 | int64 |
| year | 1241.000000 | 2016.848509 | 2.536749 | 2009.000000 | 2015.000000 | 2017.000000 | 2019.000000 | 2021.000000 | 0 | 13 | int64 |
| wheat_price | 1241.000000 | 166.657168 | 29.143361 | 101.910000 | 146.935000 | 162.080000 | 182.000000 | 308.000000 | 0 | 914 | float64 |
| barley_price | 1241.000000 | 160.379903 | 30.616877 | 82.950000 | 139.000000 | 156.290000 | 177.000000 | 309.000000 | 0 | 703 | float64 |
| oats_price | 1241.000000 | 151.400858 | 27.180133 | 73.410000 | 131.337500 | 146.667500 | 167.260000 | 287.000000 | 0 | 746 | float64 |
| rye_price | 1241.000000 | 135.776459 | 22.571790 | 71.140000 | 121.937500 | 130.597500 | 147.280000 | 230.230000 | 0 | 735 | float64 |
| pig_price | 1241.000000 | 156.789742 | 19.997224 | 101.910000 | 142.490000 | 157.210000 | 171.210000 | 216.800000 | 0 | 931 | float64 |
| beef_price | 1241.000000 | 342.864803 | 57.508153 | 143.470000 | 306.290000 | 347.730000 | 379.160000 | 512.320000 | 0 | 995 | float64 |
Describe the numerical data
Use descriptive statistics and appropriate visualisations in order to summarise the dataset(s) used, and to help justify the chosen models. [0-20]
# Correlation heat map, pairplot and histograms of the merged price data.
# NOTE(review): df still holds the non-numeric memberStateCode column; older
# pandas drops it silently in corr(), pandas >= 2.0 would raise — confirm
# the runtime version or select the numeric columns explicitly.
c_df = df.corr('pearson')
fig, ax = plt.subplots(figsize=(15,12))  # was "ffig" — typo
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1, rc={"lines.linewidth": 2.5})
# pass the axes explicitly rather than relying on the current-axes state
sns.heatmap(c_df, cmap="YlGnBu", annot=True, ax=ax)
plt.savefig('./Images/Img_2.2_MergedCorHeatmap.svg')
plt.show()
sns.pairplot(df);
plt.savefig('./Images/Img_2.2_pairplot.svg')
df.hist(figsize=(15,12));
plt.savefig('./Images/Img_2.2_price_histogram.svg')
Set the Numerical and Categorical columns
Analyse the variables in your dataset(s) and use appropriate inferential statistics to gain insights on possible population values
(e.g., if you were working with international commerce, you could find a confidence interval for the population proportion of yearly dairy exports out of all agricultural exports). [0-20]
# Split the columns into categorical and numerical sets and summarise the
# categorical ones (missing counts, unique counts, dtype).
CategoricalColumns = ['memberStateCode']
NumericalColumns = ['year', 'weekNumber', 'barley_price', 'wheat_price', 'beef_price', 'pig_price', 'oats_price', 'rye_price']
col_names = list(types_df.index)
num_cols = len(col_names)
index = range(num_cols)
# positions of the categorical columns within types_df
cat_index = [i for i in index if col_names[i] in CategoricalColumns]
summary_df_cat = missing_df.join(unq_df).join(types_df.iloc[cat_index], how='inner') #Only summarize categorical columns
# display() renders the Styler; the previous print() only emitted its repr
# ("<pandas.io.formats.style.Styler object at ...>")
display(summary_df_cat.style)
<pandas.io.formats.style.Styler object at 0x0000020F6B8C5E80>
Display plots from the Data set, including Histograms, pairplots, boxplots and correlation
Undertake research to find similarities between some country(s) against Ireland, and apply parametric and non-parametric inferential statistical techniques to compare them
(e.g., t-test, analysis of variance, Wilcoxon test, chi-squared test, among others).
You must justify your choices and verify the applicability of the tests. Hypotheses and conclusions must be clearly stated.
You are expected to use at least 5 different inferential statistics tests. [0-40]
# Box plot of beef price per member state (alphabetical order on the x axis),
# then a Shapiro-Wilk normality test on the pooled beef prices.
sns.set(color_codes=True)
sns.catplot(data=df, x='memberStateCode', y = 'beef_price' , kind='box', height=10, aspect=2, order = sorted(df['memberStateCode'].unique()));
plt.savefig('./Images/Img_2.2_price_beef_price.svg')
# NOTE(review): the second value unpacked from shapiro_test ("color") is
# unused here — confirm against the helper's definition
status, color, p_val = shapiro_test(df['beef_price'])
print('Normality test for {} {} (p_value = {})'.format("Beef Price", status, p_val))
Normality test for Beef Price failed (p_value = 3.0960516141931294e-06)
The distributions do not look similar; we can verify this by testing whether they come from the same population, using either ANOVA or a non-parametric test such as Kruskal-Wallis.
# Split the member states by per-state Shapiro normality of beef price —
# judging by the printed output, "para" keeps only the states that passed
# the test; confirm against get_parametric's definition.
para, nonpara = get_parametric(df, 'beef_price')
# we have a parametric and non parametric datasets
display(para['memberStateCode'].unique())
Member State: AT Shapiro Test: failed p-value: 0.007924087345600128 Member State: BE Shapiro Test: failed p-value: 2.3153206711867824e-05 Member State: BG Shapiro Test: passed p-value: 0.11971638351678848 Member State: CZ Shapiro Test: passed p-value: 0.8151438236236572 Member State: DE Shapiro Test: failed p-value: 0.0019766376353800297 Member State: EL Shapiro Test: passed p-value: 0.3795616626739502 Member State: ES Shapiro Test: failed p-value: 0.01968962326645851 Member State: FI Shapiro Test: passed p-value: 0.05200904235243797 Member State: FR Shapiro Test: failed p-value: 0.0015571796102449298 Member State: HU Shapiro Test: passed p-value: 0.10274165868759155 Member State: IE Shapiro Test: passed p-value: 0.09984482824802399 Member State: IT Shapiro Test: passed p-value: 0.22455312311649323 Member State: LT Shapiro Test: failed p-value: 0.00020869039872195572 Member State: LV Shapiro Test: passed p-value: 0.5952993631362915 Member State: NL Shapiro Test: failed p-value: 0.021180488169193268 Member State: PL Shapiro Test: failed p-value: 9.95198385607746e-16 Member State: PT Shapiro Test: failed p-value: 0.0033160827588289976 Member State: RO Shapiro Test: failed p-value: 0.009294197894632816 Member State: SK Shapiro Test: failed p-value: 0.038040466606616974 Member State: UK Shapiro Test: failed p-value: 6.38799929220113e-06
array(['BG', 'CZ', 'EL', 'FI', 'HU', 'IE', 'IT', 'LV'], dtype=object)
# before proceeding with the parametric test, we need to check for homogeneity of variance
# we will use the Levene test
# NOTE(review): judging by the printed output, get_levene filters the frame
# down to the states whose variance is homogeneous with Ireland's (only LV
# and IE remain) — confirm against the helper's definition.
para = get_levene(para, 'beef_price')
display(para['memberStateCode'].unique())
Member State: BG Levene Test: False Member State: CZ Levene Test: False Member State: EL Levene Test: False Member State: FI Levene Test: False Member State: HU Levene Test: False Member State: IT Levene Test: False Member State: LV Levene Test: True
array(['LV', 'IE'], dtype=object)
# One-way ANOVA on beef price for the remaining parametric states; the
# printed output reports a significant difference between Ireland and LV —
# presumably an IE-vs-other comparison inside get_aov; confirm.
aov = get_aov(para, 'beef_price')
Significant difference between Ireland and LV
sum_sq df F PR(>F)
memberStateCode 686447.139984 1.0 1188.743899 1.806799e-62
Residual 66407.424802 115.0 NaN NaN
# Box plot of pig price per member state (alphabetical order on the x axis),
# then the same parametric/non-parametric split as for beef.
sns.set(color_codes=True)
sns.catplot(data=df, x='memberStateCode', y = 'pig_price' , kind='box', height=10, aspect=2, order = sorted(df['memberStateCode'].unique()));
plt.savefig('./Images/Img_2.2_memberstate_pig_price.svg')
para, nonpara = get_parametric(df, 'pig_price')
# we have a parametric and non parametric datasets
display(para['memberStateCode'].unique())
Member State: AT Shapiro Test: passed p-value: 0.7720921039581299 Member State: BE Shapiro Test: passed p-value: 0.27508509159088135 Member State: BG Shapiro Test: passed p-value: 0.7106081247329712 Member State: CZ Shapiro Test: passed p-value: 0.273192822933197 Member State: DE Shapiro Test: passed p-value: 0.3021285831928253 Member State: EL Shapiro Test: failed p-value: 0.023727376013994217 Member State: ES Shapiro Test: failed p-value: 0.008574129082262516 Member State: FI Shapiro Test: passed p-value: 0.3151337206363678 Member State: FR Shapiro Test: passed p-value: 0.1438722014427185 Member State: HU Shapiro Test: passed p-value: 0.271283358335495 Member State: IE Shapiro Test: failed p-value: 0.017174778506159782 Member State: IT Shapiro Test: passed p-value: 0.21749120950698853 Member State: LT Shapiro Test: passed p-value: 0.8144707083702087 Member State: LV Shapiro Test: passed p-value: 0.10687637329101562 Member State: NL Shapiro Test: failed p-value: 0.005535007920116186 Member State: PL Shapiro Test: failed p-value: 0.002831840654835105 Member State: PT Shapiro Test: passed p-value: 0.1994149535894394 Member State: RO Shapiro Test: passed p-value: 0.1809341460466385 Member State: SK Shapiro Test: passed p-value: 0.7133941650390625 Member State: UK Shapiro Test: failed p-value: 0.000973705668002367
array(['AT', 'BE', 'BG', 'CZ', 'DE', 'FI', 'FR', 'HU', 'IT', 'LT', 'LV',
'PT', 'RO', 'SK'], dtype=object)
# before proceeding with the parametric test, we need to check for homogeneity of variance
# we will use the Levene test
para = get_levene(para, 'pig_price')
# NOTE(review): the printed output shows every state returning False from
# the Levene check for pig price, which is presumably why the display of
# the surviving states below is commented out — confirm.
#display(para['memberStateCode'].unique())
Member State: AT Levene Test: False Member State: BE Levene Test: False Member State: BG Levene Test: False Member State: CZ Levene Test: False Member State: DE Levene Test: False Member State: FI Levene Test: False Member State: FR Levene Test: False Member State: HU Levene Test: False Member State: IT Levene Test: False Member State: LT Levene Test: False Member State: LV Levene Test: False Member State: PT Levene Test: False Member State: RO Levene Test: False Member State: SK Levene Test: False
C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. 
return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. 
return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. 
return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. 
return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount)
Levene test false for all member states, nothing is similar to Ireland
# Box plots of barley price by member state, alphabetical on the x-axis.
sns.set(color_codes=True)
state_order = sorted(df['memberStateCode'].unique())
sns.catplot(data=df, x='memberStateCode', y='barley_price', kind='box',
            height=10, aspect=2, order=state_order);
plt.savefig('./Images/Img_2.2_memberStateCode_barley_price.svg')

# Split member states into normally- and non-normally-distributed subsets.
para, nonpara = get_parametric(df, 'barley_price')
display(para['memberStateCode'].unique())
Member State: AT Shapiro Test: failed p-value: 9.171759529635892e-07 Member State: BE Shapiro Test: failed p-value: 0.001238599419593811 Member State: BG Shapiro Test: passed p-value: 0.5139174461364746 Member State: CZ Shapiro Test: passed p-value: 0.3972421884536743 Member State: DE Shapiro Test: failed p-value: 9.288976798416115e-06 Member State: EL Shapiro Test: failed p-value: 0.006459851749241352 Member State: ES Shapiro Test: failed p-value: 3.9980921351379095e-10 Member State: FI Shapiro Test: failed p-value: 2.0793664012863644e-10 Member State: FR Shapiro Test: failed p-value: 0.0005691362894140184 Member State: HU Shapiro Test: passed p-value: 0.1997748613357544 Member State: IE Shapiro Test: failed p-value: 5.448334377433639e-06 Member State: IT Shapiro Test: passed p-value: 0.7186490297317505 Member State: LT Shapiro Test: failed p-value: 0.00018229868146590889 Member State: LV Shapiro Test: passed p-value: 0.11180797964334488 Member State: NL Shapiro Test: failed p-value: 5.383051870921918e-07 Member State: PL Shapiro Test: failed p-value: 3.338349756631942e-07 Member State: PT Shapiro Test: failed p-value: 2.253047114209039e-06 Member State: RO Shapiro Test: failed p-value: 0.04225873947143555 Member State: SK Shapiro Test: failed p-value: 7.894093141658232e-05 Member State: UK Shapiro Test: failed p-value: 0.0007186824805103242
array(['BG', 'CZ', 'HU', 'IT', 'LV'], dtype=object)
# Check homogeneity of variance (Levene test) before any parametric test
# on the states whose barley prices passed the normality check.
para = get_levene(para, 'barley_price')
display(para['memberStateCode'].unique())
Member State: BG Levene Test: False Member State: CZ Levene Test: False Member State: HU Levene Test: False Member State: IT Levene Test: False Member State: LV Levene Test: False
C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. 
return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount) C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice. return _methods._mean(a, axis=axis, dtype=dtype, C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\numpy\core\_methods.py:190: RuntimeWarning: invalid value encountered in double_scalars ret = ret.dtype.type(ret / rcount)
array(['LV'], dtype=object)
No member state passed the Levene test
# Box plots of wheat price by member state, alphabetical on the x-axis.
sns.set(color_codes=True)
state_order = sorted(df['memberStateCode'].unique())
sns.catplot(data=df, x='memberStateCode', y='wheat_price', kind='box',
            height=10, aspect=2, order=state_order);
plt.savefig('./Images/Img_2.2_memberStateCode_wheat_price.svg')

# Split member states into normally- and non-normally-distributed subsets.
para, nonpara = get_parametric(df, 'wheat_price')
display(para['memberStateCode'].unique())
Member State: AT Shapiro Test: failed p-value: 1.2426149851307855e-06 Member State: BE Shapiro Test: failed p-value: 0.0028860021848231554 Member State: BG Shapiro Test: passed p-value: 0.43810340762138367 Member State: CZ Shapiro Test: failed p-value: 0.0055374689400196075 Member State: DE Shapiro Test: failed p-value: 2.603988286864478e-05 Member State: EL Shapiro Test: passed p-value: 0.1413494497537613 Member State: ES Shapiro Test: failed p-value: 0.03423668071627617 Member State: FI Shapiro Test: failed p-value: 0.0001745796180330217 Member State: FR Shapiro Test: passed p-value: 0.13887517154216766 Member State: HU Shapiro Test: passed p-value: 0.08363591879606247 Member State: IE Shapiro Test: failed p-value: 2.4236016543000005e-05 Member State: IT Shapiro Test: passed p-value: 0.6737550497055054 Member State: LT Shapiro Test: passed p-value: 0.25959512591362 Member State: LV Shapiro Test: passed p-value: 0.14130589365959167 Member State: NL Shapiro Test: failed p-value: 1.4000631381350104e-05 Member State: PL Shapiro Test: failed p-value: 2.1739029421041778e-07 Member State: PT Shapiro Test: failed p-value: 6.099916845414555e-06 Member State: RO Shapiro Test: failed p-value: 0.0007016176823526621 Member State: SK Shapiro Test: failed p-value: 0.0006710886373184621 Member State: UK Shapiro Test: failed p-value: 0.0057765040546655655
array(['BG', 'EL', 'FR', 'HU', 'IT', 'LT', 'LV'], dtype=object)
Irish data is not normal, so we are not able to go further with the Anova test; we would use the Kruskal-Wallis test instead. (Not coded due to running out of time.)
# Box plots of oats price by member state, alphabetical on the x-axis.
sns.set(color_codes=True)
state_order = sorted(df['memberStateCode'].unique())
sns.catplot(data=df, x='memberStateCode', y='oats_price', kind='box',
            height=10, aspect=2, order=state_order);
plt.savefig('./Images/Img_2.2_memberStateCode_oats_price.svg')

# Split member states into normally- and non-normally-distributed subsets.
para, nonpara = get_parametric(df, 'oats_price')
display(para['memberStateCode'].unique())
Member State: AT Shapiro Test: failed p-value: 1.2308659336213168e-07 Member State: BE Shapiro Test: failed p-value: 0.006417360156774521 Member State: BG Shapiro Test: passed p-value: 0.7382279634475708 Member State: CZ Shapiro Test: passed p-value: 0.5673552751541138 Member State: DE Shapiro Test: failed p-value: 0.026611097157001495 Member State: EL Shapiro Test: passed p-value: 0.07224301248788834 Member State: ES Shapiro Test: passed p-value: 0.2003484070301056 Member State: FI Shapiro Test: failed p-value: 1.464450921639937e-10 Member State: FR Shapiro Test: failed p-value: 0.004469607025384903 Member State: HU Shapiro Test: passed p-value: 0.36896786093711853 Member State: IE Shapiro Test: failed p-value: 0.0005318887415342033 Member State: IT Shapiro Test: passed p-value: 0.8043407797813416 Member State: LT Shapiro Test: failed p-value: 0.003087714547291398 Member State: LV Shapiro Test: failed p-value: 7.324905163841322e-05 Member State: NL Shapiro Test: failed p-value: 0.004334065597504377 Member State: PL Shapiro Test: failed p-value: 0.023839857429265976 Member State: PT Shapiro Test: passed p-value: 0.10996796190738678 Member State: RO Shapiro Test: failed p-value: 2.7131763999932446e-06 Member State: SK Shapiro Test: failed p-value: 1.2097563057977823e-06 Member State: UK Shapiro Test: failed p-value: 0.006220292299985886
array(['BG', 'CZ', 'EL', 'ES', 'HU', 'IT', 'PT'], dtype=object)
Irish data is not normal, so we are not able to go further with the Anova test; we would use the Kruskal-Wallis test instead. (Not coded due to running out of time.)
# Box plots of rye price by member state, alphabetical on the x-axis.
sns.set(color_codes=True)
state_order = sorted(df['memberStateCode'].unique())
sns.catplot(data=df, x='memberStateCode', y='rye_price', kind='box',
            height=10, aspect=2, order=state_order);
plt.savefig('./Images/Img_2.2_memberStateCode_rye_price.svg')

# Split member states into normally- and non-normally-distributed subsets.
para, nonpara = get_parametric(df, 'rye_price')
display(para['memberStateCode'].unique())
Member State: AT Shapiro Test: failed p-value: 4.69068945676554e-05 Member State: BE Shapiro Test: failed p-value: 0.002135208109393716 Member State: BG Shapiro Test: passed p-value: 0.338758647441864 Member State: CZ Shapiro Test: passed p-value: 0.1593373417854309 Member State: DE Shapiro Test: failed p-value: 0.0015077561838552356 Member State: EL Shapiro Test: failed p-value: 0.046810995787382126 Member State: ES Shapiro Test: failed p-value: 0.0024063135497272015 Member State: FI Shapiro Test: failed p-value: 6.300207644471811e-08 Member State: FR Shapiro Test: failed p-value: 7.307748455787078e-05 Member State: HU Shapiro Test: passed p-value: 0.054759394377470016 Member State: IE Shapiro Test: failed p-value: 0.00016448770475108176 Member State: IT Shapiro Test: passed p-value: 0.25784897804260254 Member State: LT Shapiro Test: passed p-value: 0.26637279987335205 Member State: LV Shapiro Test: failed p-value: 0.002487144200131297 Member State: NL Shapiro Test: failed p-value: 0.00025098409969359636 Member State: PL Shapiro Test: failed p-value: 5.12535361374411e-11 Member State: PT Shapiro Test: failed p-value: 0.026980988681316376 Member State: RO Shapiro Test: failed p-value: 7.130643643904477e-05 Member State: SK Shapiro Test: passed p-value: 0.052963707596063614 Member State: UK Shapiro Test: failed p-value: 0.00011726957745850086
array(['BG', 'CZ', 'HU', 'IT', 'LT', 'SK'], dtype=object)
Irish data is not normal, so we are not able to go further with the Anova test; we would use the Kruskal-Wallis test instead. (Not coded due to running out of time.)
Check for Outliers
Use the outcome of your analysis to deepen your research.
Indicate the challenges you faced in the process. [0-20]
# Add an absolute z-score column per price series so outliers can be
# filtered with a simple |z| < 3 mask later on.
price_columns = ['barley_price', 'wheat_price', 'beef_price',
                 'pig_price', 'oats_price', 'rye_price']
for col in price_columns:
    df[f'Zscore_{col}'] = np.abs(stats.zscore(df[col]))
Check distributions of the data
df.shape
(1241, 15)
# One distribution bar plot per numeric feature (custom_barplot defined earlier).
for feature in NumericalColumns:
    custom_barplot(df1=df, col1=feature)
# Keep only rows whose absolute z-score is below 3 in ALL six price columns.
# FIX: the original expression used the bare columns (df['Zscore_oats_price'])
# and (df['Zscore_rye_price']) without the "< 3" comparison — a truthiness
# mask, not an outlier test — so oats/rye outliers were never removed.
df_new = df[(df['Zscore_barley_price'] < 3) &
            (df['Zscore_wheat_price'] < 3) &
            (df['Zscore_beef_price'] < 3) &
            (df['Zscore_pig_price'] < 3) &
            (df['Zscore_oats_price'] < 3) &
            (df['Zscore_rye_price'] < 3)]  # remove outliers in all six columns
display(df_new.shape)
# Re-plot distributions after outlier removal.
for f in ['year', 'weekNumber', 'barley_price', 'wheat_price', 'beef_price', 'pig_price', 'oats_price', 'rye_price']:
    custom_barplot(df1=df_new, col1=f)
# NOTE(review): `f` below is the leftover loop variable ('rye_price' after the
# loop above) — possibly this loop was meant to be nested; confirm intent.
for g in ['barley_price', 'wheat_price', 'oats_price', 'rye_price']:
    custom_scatterplot(df_new, col_x=f, col_y=g)
(1217, 15)
C:\Users\stehayes\AppData\Local\Temp\ipykernel_6032\1669066834.py:172: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`. f, ax = plt.subplots(figsize=(11.5, 11.5))
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
<Figure size 1200x500 with 0 Axes>
All data frame changes go in this section, including the reasons for making the changes.
Taking into consideration the tasks required in the machine learning section, use appropriate data cleaning, engineering, extraction and/or other techniques to structure and enrich your data.
Rationalize your decisions and implementation, including evidence of how your process has addressed the problems identified in the EDA (Exploratory Data Analysis) stage and how your structured data will assist in the analysis stage.
This should include visualizations to illustrate your work and evidence to support your methodology.[0-30]
# Drop the helper z-score columns before persisting the cleaned dataset.
# FIX: reassign instead of inplace=True — df_new is a filtered slice of df,
# and the in-place drop raised pandas' SettingWithCopyWarning (see output).
df_new = df_new.drop(columns=['Zscore_barley_price', 'Zscore_wheat_price',
                              'Zscore_beef_price', 'Zscore_pig_price',
                              'Zscore_oats_price', 'Zscore_rye_price'])
df_new.to_csv('./Data/Final/grain_meat_prices.csv', index=False)
C:\Users\stehayes\AppData\Local\Temp\ipykernel_6032\3627159841.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_new.drop(['Zscore_barley_price', 'Zscore_wheat_price', 'Zscore_beef_price', 'Zscore_pig_price', 'Zscore_oats_price', 'Zscore_rye_price'], axis=1, inplace=True)
# Load the scraped journal/press-release text: one row per year
# ('Key' = year, 'Value' = raw extracted text).
text_data_path = './Data/Final/text_data.csv'
df_text = pd.read_csv(text_data_path)
df_text.head()
| Key | Value | |
|---|---|---|
| 0 | 1962 | -w ■ - -r-irishjournal of thegrassland asso... |
| 1 | 1968 | research for farmip^s^^c £ ''<■corrigendia f... |
| 2 | 1969 | second edward richards orpen memorial lec... |
| 3 | 1970 | mmsome aspects of efficiency in beef product... |
| 4 | 1971 | irish grassland andchu'ii manimal productio... |
# Restrict the text data to the years present in the price dataframe.
years = df['year'].unique().tolist()
display(df_text.head())
df_text = df_text[df_text['Key'].isin(years)]
display(df_text.head())
| Key | Value | |
|---|---|---|
| 0 | 1962 | -w ■ - -r-irishjournal of thegrassland asso... |
| 1 | 1968 | research for farmip^s^^c £ ''<■corrigendia f... |
| 2 | 1969 | second edward richards orpen memorial lec... |
| 3 | 1970 | mmsome aspects of efficiency in beef product... |
| 4 | 1971 | irish grassland andchu'ii manimal productio... |
| Key | Value | |
|---|---|---|
| 35 | 2009 | irish grassland association journal2009 volu... |
| 36 | 2011 | 24 july 2011 ifa rural development chairman t... |
| 37 | 2012 | 16 may 2012 speaking as the main co-ops have ... |
| 38 | 2013 | 22 may 2013 ifa environment + rural affairs c... |
| 39 | 2014 | 11 september 2014 ifa rural development chair... |
# Score each year's text with the sentiment_scores helper (defined earlier);
# judging by the output it returns the VADER compound score per document.
df_text['sentiment'] = df_text['Value'].map(sentiment_scores)
df_text.head()
Overall sentiment dictionary is : {'neg': 0.041, 'neu': 0.861, 'pos': 0.098, 'compound': 1.0}
Positive
Overall sentiment dictionary is : {'neg': 0.077, 'neu': 0.826, 'pos': 0.097, 'compound': 0.8878}
Positive
Overall sentiment dictionary is : {'neg': 0.1, 'neu': 0.802, 'pos': 0.098, 'compound': -0.988}
Negative
Overall sentiment dictionary is : {'neg': 0.059, 'neu': 0.835, 'pos': 0.106, 'compound': 0.9966}
Positive
Overall sentiment dictionary is : {'neg': 0.109, 'neu': 0.792, 'pos': 0.099, 'compound': -0.9347}
Negative
Overall sentiment dictionary is : {'neg': 0.009, 'neu': 0.918, 'pos': 0.073, 'compound': 0.9943}
Positive
Overall sentiment dictionary is : {'neg': 0.057, 'neu': 0.862, 'pos': 0.081, 'compound': 0.9971}
Positive
Overall sentiment dictionary is : {'neg': 0.072, 'neu': 0.828, 'pos': 0.1, 'compound': 0.9978}
Positive
Overall sentiment dictionary is : {'neg': 0.059, 'neu': 0.832, 'pos': 0.109, 'compound': 0.9995}
Positive
Overall sentiment dictionary is : {'neg': 0.148, 'neu': 0.763, 'pos': 0.088, 'compound': -0.9389}
Negative
Overall sentiment dictionary is : {'neg': 0.032, 'neu': 0.892, 'pos': 0.076, 'compound': 0.9992}
Positive
Overall sentiment dictionary is : {'neg': 0.032, 'neu': 0.868, 'pos': 0.1, 'compound': 0.999}
Positive
| Key | Value | sentiment | |
|---|---|---|---|
| 35 | 2009 | irish grassland association journal2009 volu... | 1.0000 |
| 36 | 2011 | 24 july 2011 ifa rural development chairman t... | 0.8878 |
| 37 | 2012 | 16 may 2012 speaking as the main co-ops have ... | -0.9880 |
| 38 | 2013 | 22 may 2013 ifa environment + rural affairs c... | 0.9966 |
| 39 | 2014 | 11 september 2014 ifa rural development chair... | -0.9347 |
# Keep only the year key and the sentiment score, renaming 'Key' -> 'year'
# so it can be merged with the price data.
# FIX: chain a non-inplace rename — rename(inplace=True) on a
# column-selected copy risks the same SettingWithCopyWarning pandas
# raised on the earlier inplace drop.
df_text = df_text[['Key', 'sentiment']].rename(columns={'Key': 'year'})
df_text.head()
| year | sentiment | |
|---|---|---|
| 35 | 2009 | 1.0000 |
| 36 | 2011 | 0.8878 |
| 37 | 2012 | -0.9880 |
| 38 | 2013 | 0.9966 |
| 39 | 2014 | -0.9347 |
Collect and develop a dataset based on the agriculture topic related to Ireland as well as other parts of the world.
Perform a sentiment analysis for an appropriate agricultural topic (e.g., product price, feed quality etc.) from the producers' and consumers' point of view in Ireland. [0 - 25]
Clustering, Linear regression, random forest.
Use of multiple models (at least two) to compare and contrast results and insights gained.
Describe the rationale and justification for the choice of machine learning models for the above-mentioned scenario.
Machine Learning models can be used for Prediction, Classification, Clustering, sentiment analysis, recommendation systems and Time series analysis.
You should plan on trying multiple approaches (at least two) with proper selection of hyperparameters using GridSearchCV method.
You can choose appropriate features from the datasets and a target feature to answer the question asked in the scenario in the case of supervised learning. [0 - 30]
# Reload the cleaned price dataset for the modelling stage.
df = pd.read_csv('./Data/Final/grain_meat_prices.csv')
df.columns.values
# NOTE: merging the yearly sentiment onto the price data was tried
# (df.merge(df_text, on='year', how='right')) but did not improve the
# models, so it is left out and 'sentiment' is likewise excluded from
# NumericalColumns.
NumericalColumns = ['year', 'weekNumber', 'barley_price', 'wheat_price', 'beef_price', 'pig_price', 'oats_price', 'rye_price']
# One-hot encode the member-state code so it can feed the regressors.
df = pd.get_dummies(df, columns=['memberStateCode'])
df.columns.values
array(['weekNumber', 'year', 'wheat_price', 'barley_price', 'oats_price',
'rye_price', 'pig_price', 'beef_price', 'memberStateCode_AT',
'memberStateCode_BE', 'memberStateCode_BG', 'memberStateCode_CY',
'memberStateCode_CZ', 'memberStateCode_DE', 'memberStateCode_EL',
'memberStateCode_ES', 'memberStateCode_FI', 'memberStateCode_FR',
'memberStateCode_HU', 'memberStateCode_IE', 'memberStateCode_IT',
'memberStateCode_LT', 'memberStateCode_LV', 'memberStateCode_NL',
'memberStateCode_PL', 'memberStateCode_PT', 'memberStateCode_RO',
'memberStateCode_SK', 'memberStateCode_UK'], dtype=object)
# Rescale every column into [0, 1] before modelling:
# x' = (x - min(x)) / (max(x) - min(x))
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)
df = pd.DataFrame(scaled_values, columns=df.columns)
df.head()
| weekNumber | year | wheat_price | barley_price | oats_price | rye_price | pig_price | beef_price | memberStateCode_AT | memberStateCode_BE | ... | memberStateCode_IE | memberStateCode_IT | memberStateCode_LT | memberStateCode_LV | memberStateCode_NL | memberStateCode_PL | memberStateCode_PT | memberStateCode_RO | memberStateCode_SK | memberStateCode_UK | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.500 | 0.000000 | 0.000000 | 0.000000 | 0.237161 | 0.100976 | 0.546860 | 0.135083 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 1 | 0.000 | 0.083333 | 0.070157 | 0.087134 | 0.248478 | 0.100976 | 0.523011 | 0.323000 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 2 | 0.125 | 0.083333 | 0.218744 | 0.150311 | 0.231672 | 0.271546 | 0.458357 | 0.226970 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 3 | 0.250 | 0.083333 | 0.271361 | 0.094292 | 0.401870 | 0.295867 | 0.409633 | 0.166172 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 4 | 0.375 | 0.083333 | 0.493017 | 0.430035 | 0.389569 | 0.413643 | 0.519191 | 0.145661 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 29 columns
# Box-plot each numerical feature and annotate the title with its Shapiro-Wilk
# normality verdict (shapiro_test is a helper defined earlier in the notebook).
# Fix: replaced the manual counter (i = 1 ... i = i + 1) with
# enumerate(..., start=1), the idiomatic way to number loop iterations.
plt.figure(figsize=(20, 30))
for i, variable in enumerate(NumericalColumns, start=1):
    plt.subplot(5, 3, i)
    plt.boxplot(df[variable], whis=1.5)
    shapiro_results = shapiro_test(df[variable])
    plt.title(variable + ' Shapiro Test ' + str(shapiro_results[0]))
plt.show()
# Spot-check a single feature: pig_price fails the Shapiro-Wilk normality
# test (p ~= 0.00017 per the output below), i.e. it is not normally distributed.
shapiro_test(df.pig_price)
('failed', 'red', 0.00017267563089262694)
# First target: beef price. Hold out 20% of the rows for testing.
# NOTE(review): this 80/20 split appears to be superseded by the 70/30 split in
# the cross-validation cell further down — confirm whether this cell is needed.
X = df.drop(['beef_price'], axis=1)
y = df['beef_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)
(973, 28) (244, 28) (973,) (244,)
You should train and test for Supervised Learning and other appropriate metrics for unsupervised/ semi-supervised machine learning models that you have chosen.
Use cross validation to provide authenticity of the modelling outcomes.
You can apply dimensionality reduction methods to prepare the dataset based on your machine learning modelling requirements. [0 - 30]
# The CV folds are the same regardless of the target variable.
folds = KFold(n_splits=10, shuffle=True, random_state=42)

# Model the beef price first, this time with a 70/30 train/test split.
X = df.drop(['beef_price'], axis=1)
y = df['beef_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Hyperparameter grid for RFE: every candidate feature count from 1 up to
# (number of features - 1).
hyper_params = [{'n_features_to_select': list(range(1, len(X_train.columns)))}]

for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

linear_regression(X_train, X_test, y_train, y_test, folds, hyper_params)
Fitting 10 folds for each of 27 candidates, totalling 270 fits
R-squared of the model in training set is: 0.8493403689776827
The optimal number of features is: {'n_features_to_select': 26}
Root mean squared error of the prediction is: 0.06620017138378621
-----Test set statistics-----
R-squared of the model in test set is: -1.3471524994443911e+22
Root mean squared error of the prediction is: 19210355969.79176
Mean absolute percentage error of the prediction is: 274385223408.36005
# Ridge regression on beef price: grid-search 100 alpha values in [1e-6, 1)
# with step 0.01 over the shared 10-fold CV.
# NOTE(review): bare `arange` — presumably numpy.arange imported earlier in the
# notebook; confirm, otherwise use np.arange.
alpha = arange(0.000001, 1, 0.01)
ridge_regression(X_train, y_train, X_test, y_test, alpha, folds)
Fitting 10 folds for each of 100 candidates, totalling 1000 fits R-squared of the model in training set is: 0.8491155598478373 The optimal value of alpha is: 0.410001 Root mean squred error of the train set is: 0.06624954378437216 -----Test set statistics----- R-squared of the model in test set is: 0.8584844067889211 Root mean squared error of the prediction is: 0.062262876363436956 Mean absolute percentage error of the prediction is: 12.31332629787834
# Lasso regression on beef price: same alpha grid shape, but starting from a
# near-zero 1e-50 so an (almost) unpenalised fit is among the candidates.
alpha = arange(1e-50, 1, 0.01)
lasso_regression(X_train, y_train, X_test, y_test, alpha, folds)
Fitting 10 folds for each of 100 candidates, totalling 1000 fits
C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\sklearn\linear_model\_coordinate_descent.py:634: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.865e+00, tolerance: 2.475e-03 Linear regression models with null weight for the l1 regularization term are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead. model = cd_fast.enet_coordinate_descent(
R-squared of the model in training set is: 0.8493476727413121 The optimal value of alpha is: 1e-50 Root mean squared error of the train set is: 0.06619856671948152 -----Test set statistics----- R-squared of the model in test set is: 0.8584815605267185 Root mean squared error of the prediction is: 0.062263502497902176 Mean absolute percentage error of the prediction is: 12.215551000743123
# Decision tree on beef price. Training R² of 1.0 in the output below shows
# the unconstrained tree overfits the training set.
decision_tree(X_train, X_test, y_train, y_test)
R-squared of the model in training set is: 1.0 Root mean squared error of the training set is: 1.7126079809080954e-17 -----Test set statistics----- R-squared of the model in test set is: 0.8832450191757626 Root mean squared error of the prediction is: 0.0565541839717465 Mean absolute percentage error of the prediction is: 9.3096505984877
# Random forest on beef price — best test R² (~0.936) of the models tried.
random_forest_regressor(X_train, X_test, y_train, y_test)
R-squared of the model in training set is: 0.9912550653141674 Mean squared error for train set: 0.04071035948615301 -----Test set statistics----- R-squared of the model in test set is: 0.935781871009006 Root mean squared error of the prediction is: 0.041942648990405944 Mean absolute percentage error of the prediction is: 8.10444476381119
# ## Vis random forest
# # Extract single tree
# estimator = regressor.estimators_[5]
# from sklearn.tree import export_graphviz
# # Export as dot file
# export_graphviz(estimator, out_file='tree.dot',
# feature_names = df.feature_names,
# class_names = df.target_names,
# rounded = True, proportion = False,
# precision = 2, filled = True)
# # Convert to png using system command (requires Graphviz)
# from subprocess import call
# call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# # Display in jupyter notebook
# from IPython.display import Image
# Image(filename = 'tree.png')
# Second target: pig price, using the same 70/30 split and the same RFE
# hyperparameter grid as for beef.
X = df.drop(['pig_price'], axis=1)
y = df['pig_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Candidate feature counts for RFE: 1 .. (number of features - 1).
hyper_params = [{'n_features_to_select': list(range(1, len(X_train.columns)))}]

for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

linear_regression(X_train, X_test, y_train, y_test, folds, hyper_params)
Fitting 10 folds for each of 27 candidates, totalling 270 fits
R-squared of the model in training set is: 0.26940419667124327
The optimal number of features is: {'n_features_to_select': 22}
Root mean squared error of the prediction is: 0.16005977788951786
-----Test set statistics-----
R-squared of the model in test set is: -1.5533621382791133e+22
Root mean squared error of the prediction is: 22603973498.610405
Mean absolute percentage error of the prediction is: 129439969730.4195
# Ridge regression on pig price: 100 alpha candidates in [1e-6, 1), step 0.01.
alpha = arange(0.000001, 1, 0.01)
ridge_regression(X_train, y_train, X_test, y_test, alpha, folds)
Fitting 10 folds for each of 100 candidates, totalling 1000 fits R-squared of the model in training set is: 0.26849768079072067 The optimal value of alpha is: 0.48000099999999996 Root mean squred error of the train set is: 0.16015904738236986 -----Test set statistics----- R-squared of the model in test set is: 0.22306449592458466 Root mean squared error of the prediction is: 0.15986040753431852 Mean absolute percentage error of the prediction is: 37.533883878411366
# Lasso regression on pig price: alpha grid starts from near-zero (1e-50) so a
# practically unpenalised fit is included among the candidates.
alpha = arange(1e-50, 1, 0.01)
lasso_regression(X_train, y_train, X_test, y_test, alpha, folds)
Fitting 10 folds for each of 100 candidates, totalling 1000 fits
C:\Users\stehayes\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.8_qbz5n2kfra8p0\LocalCache\local-packages\Python38\site-packages\sklearn\linear_model\_coordinate_descent.py:634: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.090e+01, tolerance: 2.984e-03 Linear regression models with null weight for the l1 regularization term are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead. model = cd_fast.enet_coordinate_descent(
R-squared of the model in training set is: 0.269405038061163 The optimal value of alpha is: 1e-50 Root mean squared error of the train set is: 0.16005968572315046 -----Test set statistics----- R-squared of the model in test set is: 0.22391578541790924 Root mean squared error of the prediction is: 0.15977280387682388 Mean absolute percentage error of the prediction is: 37.324195535542124
# Decision tree on pig price — again overfits (training R² = 1.0 below).
decision_tree(X_train, X_test, y_train, y_test)
R-squared of the model in training set is: 1.0 Root mean squared error of the training set is: 1.1417386539387303e-17 -----Test set statistics----- R-squared of the model in test set is: 0.6141445727062276 Root mean squared error of the prediction is: 0.1126576631251324 Mean absolute percentage error of the prediction is: 21.876199934238375
# Random forest on pig price — best of the pig models (test R² ~0.78), though
# notably weaker than the beef equivalent.
random_forest_regressor(X_train, X_test, y_train, y_test)
R-squared of the model in training set is: 0.9660688569673888 Mean squared error for train set: 0.042653019077786555 -----Test set statistics----- R-squared of the model in test set is: 0.7792874025228431 Root mean squared error of the prediction is: 0.08520439413508035 Mean absolute percentage error of the prediction is: 20.068884670090416
A Table or graphics should be provided to illustrate the similarities and contrast of the Machine Learning modelling outcomes based on the scoring metric used for the analysis of the above-mentioned scenario.
Discuss and elaborate your understanding clearly. [0 - 15]
We look for the lowest MSE across the models, and for models whose error scores are similar on the training and test sets; that similarity indicates the model will perform well on unseen data.
One conclusion we can draw is that this dataset does not contain the data necessary to predict pig prices with any accuracy. This is likely because pigs are fed a mixture of cereals and proteins in the form of soya, and we do not have soya prices in the dataset; taking this project further would require adding soya and other plant-protein price sources to the dataset.
However, for cattle the data seems adequate for predicting the prices of adult beef animals.
Additional notes : All: